###############################################################################
#   Copyright : SyN Lab, University of Texas at San Antonio.                  #
#   Authors   : Prof.Palden Lama, Mr.Kumar Thummapudi, Prof.Rajendra Boppana. #
#   Usage     : python3 ml_model.py <model_name> <classifier> <input_csv_file>#
#       Possible models     :        hpc, io, combined                        #
#       Possible classifiers:   svm, knn, dt, rf, dnn, xgb.                   #
#   Note: The results may slightly vary based on the versions of the packages.#
###############################################################################
import os

# Cap the BLAS thread pools at one thread each. This has to happen before
# anything that loads NumPy's BLAS backend is imported (pandas, scikit-learn,
# NumPy itself, TensorFlow, ...): OpenBLAS sizes its thread pool when the
# library is loaded, so assigning these variables after the imports is too
# late to have an effect. The imports below intentionally follow this setup.
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'

import itertools
import sys

import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, balanced_accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
import numpy as np
import tensorflow as tf
import autokeras as ak
from tensorflow.keras.models import load_model
from xgboost import XGBClassifier

# Function to generate the results.
def report_results(df_out, y_test, predicted_y, mode, trial):
  """Score one trial's predictions and append them as a new row of df_out.

  The metrics treat label 1 as the positive class and label 0 as the
  negative class (the confusion matrix is built with labels=[0,1]).

  Parameters
  ----------
  df_out : pandas.DataFrame
      Metrics accumulated so far, one row per trial. Not modified in place.
  y_test : array-like
      Ground-truth labels of the test samples.
  predicted_y : array-like
      Labels predicted by the classifier for the same samples.
  mode, trial :
      Not used by this function; kept so existing callers keep working.

  Returns
  -------
  pandas.DataFrame
      A new frame holding the rows of df_out followed by one row with the
      keys 'balanced_accuracy', 'f1-score', 'precision', 'recall', 'fpr',
      'fnr' and 'auc'.

  Notes
  -----
  The 'auc' column holds plain accuracy (accuracy_score), not an area under
  the ROC curve. The key is kept unchanged because the caller's column list
  and any downstream consumers refer to it by that name.
  """
  from sklearn.metrics import accuracy_score

  cm = confusion_matrix(y_test, predicted_y, labels=[0,1])
  tn, fp, fn, tp = cm.ravel()

  # A split without any negative (or positive) sample makes the rate
  # undefined. Report NaN explicitly instead of relying on a NumPy
  # divide-by-zero warning; NaN is skipped when the caller averages.
  negatives = tn + fp
  positives = tp + fn
  fpr = fp / negatives if negatives else float('nan')
  fnr = fn / positives if positives else float('nan')

  balanced_accuracy = balanced_accuracy_score(y_test, predicted_y)
  precision, recall, fscore, _ = precision_recall_fscore_support(y_test, predicted_y, average='binary')
  accuracy = accuracy_score(y_test, predicted_y)

  row = pd.DataFrame([{'balanced_accuracy': balanced_accuracy,
                       'f1-score': fscore,
                       'precision': precision,
                       'recall': recall,
                       'fpr': fpr,
                       'fnr': fnr,
                       'auc': accuracy}])

  if len(df_out) == 0:
    # Concatenating with an empty frame relies on dtype handling that pandas
    # has deprecated; for the first row, return it directly, laid out with
    # the accumulator's columns first (as concat would have ordered them).
    ordered = list(df_out.columns) + [c for c in row.columns if c not in df_out.columns]
    return row.reindex(columns=ordered)

  return pd.concat([df_out, row], ignore_index=True)

# Feature selection per model name. The regexes keep the listed counters plus
# the bookkeeping columns SampleApp, SampleClass and Round; "combined" keeps
# every column except SampleLoad.
_FEATURE_SELECTORS = {
  "hpc": lambda frame: frame.filter(regex='L1-icache-load-misses|LLC-stores|branch-load-misses|instructions|node-load-misses|SampleApp|SampleClass|Round'),
  "io": lambda frame: frame.filter(regex='rd_req|rd_bytes|wr_req|wr_bytes|flush_operations|rd_total_times|wr_total_times|flush_total_times|SampleApp|SampleClass|Round'),
  "combined": lambda frame: frame.drop(columns=['SampleLoad']),
}

# Factories returning a fresh, unfitted classifier for each supported name.
# NOTE(review): n_jobs=128 for xgb is hard-coded; presumably sized for the
# authors' machine - confirm before changing.
# NOTE(review): probability=True on SVC enables probability estimation that
# the predict() call below does not appear to use - confirm before removing.
_CLASSIFIER_FACTORIES = {
  'rf': lambda: RandomForestClassifier(random_state=0, n_jobs=-1),
  'xgb': lambda: XGBClassifier(random_state=0, n_jobs=128),
  'dnn': lambda: ak.StructuredDataClassifier(overwrite=True, max_trials=1),
  'knn': lambda: KNeighborsClassifier(n_neighbors = 5, n_jobs=-1),
  'dt': lambda: DecisionTreeClassifier(random_state=0),
  'svm': lambda: svm.SVC(probability=True, random_state=0),
}

if __name__ == "__main__":

  # Ensure enough number of parameters are received as input.
  if len(sys.argv) != 4:
    print("Usage: ", sys.argv[0], " <model> <classifier> <input_csv>")
    print(" Ex: ", sys.argv[0], " hpc rf data_agg.csv")
    sys.exit(1)

  model = sys.argv[1].lower()
  mode = sys.argv[2].lower()
  input_file = sys.argv[3]

  # Reject unknown names before loading any data, and exit with a non-zero
  # status so calling scripts can tell that nothing was evaluated.
  if model not in _FEATURE_SELECTORS:
    print("Possible models are: hpc/io/combined.\nPlease enter one of these models.")
    sys.exit(1)
  if mode not in _CLASSIFIER_FACTORIES:
    print("Possible classifers are: rf/xgb/dnn/knn/dt/svm.\nPlease enter one of these classifiers.")
    sys.exit(1)

  roundlist = [1,2,3,4,5,6,7]
  df = pd.read_csv(input_file)
  df_out = pd.DataFrame(columns=['balanced_accuracy','f1-score','precision', 'recall', 'fpr', 'fnr', 'auc'])

  # Pick the input data based on model specification.
  df = _FEATURE_SELECTORS[model](df)

  trnapplist = ['133b', '17d1', '4f7b', '7fae', '7zip', 'sDel','aesC', 'DryR']
  meta_columns = ['SampleApp','SampleClass', 'Round']

  # Every way of choosing 5 of the 7 rounds for training is one trial; the
  # remaining 2 rounds form that trial's test set.
  trial = 0
  for trial, trnround in enumerate(itertools.combinations(roundlist, 5), start=1):
    testround = tuple(set(roundlist) - set(trnround))

    # Training uses only the apps in trnapplist; the test split is filtered
    # by round alone, so it keeps every app present in those rounds.
    data_train = df[df.SampleApp.isin(trnapplist) & \
                    df.Round.isin(trnround)].reset_index(drop=True)
    data_test = df[df.Round.isin(testround)].reset_index(drop=True)
    X_train = data_train.drop(columns=meta_columns)
    y_train = data_train.SampleClass
    X_test = data_test.drop(columns=meta_columns)
    y_test = data_test.SampleClass
    if trial == 1:
      print ("Training Apps: ", data_train.SampleApp.unique())
      print ("Testing Apps: ", data_test.SampleApp.unique())
      print("Training sample size (per iteration): ", X_train.shape)
      print("Testing sample size (per iteration): ", X_test.shape)

    # Data Normalization using StandardScaler (fitted on training data only).
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Fit the training data into the required classifier.
    clf = _CLASSIFIER_FACTORIES[mode]()
    if mode == 'dnn':
      clf.fit(X_train, y_train, epochs=10)
      # Use a separate name so the CLI 'model' selection is not overwritten.
      keras_model = clf.export_model()
      keras_model.summary()
    else:
      clf.fit(X_train, y_train)

    # Predict the testing data using the constructed classifier.
    predicted_y = clf.predict(X_test)
    df_out = report_results(df_out, y_test, predicted_y, mode, trial)

  # Print the performance statistics as average of all 21 iterations rounded to three decimals.
  print("Total Iterations Completed: ", trial)
  print(df_out.mean(axis=0).round(decimals=3))
    

    
    
    
